From 91006d5acb030cd3b35730c710452efd1aefe029 Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Sun, 6 Aug 2006 14:01:47 +0000 Subject: [PATCH] Optimised Sanitizer::removeHTMLtags, Parser::unstrip, Parser::doMagicLinks, Parser::replaceInternalLinks and Parser::replace_callback. Introduced fast abridged entry point Parser::recursiveTagParse, for recursive parsing called from Cite.php and similar extensions. Resolved a known fail parser test. --- includes/Parser.php | 611 +++++++++++++++++------------------- includes/Sanitizer.php | 124 ++++---- maintenance/parserTests.txt | 2 +- 3 files changed, 352 insertions(+), 385 deletions(-) diff --git a/includes/Parser.php b/includes/Parser.php index dd8415469d..a13a0ac998 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -187,6 +187,7 @@ class Parser * @private */ function clearState() { + wfProfileIn( __METHOD__ ); if ( $this->mFirstCall ) { $this->firstCallInit(); } @@ -228,6 +229,7 @@ class Parser $this->mForceTocPosition = false; wfRunHooks( 'ParserClearState', array( &$this ) ); + wfProfileOut( __METHOD__ ); } /** @@ -235,7 +237,7 @@ class Parser * * @public */ - function UniqPrefix() { + function uniqPrefix() { return $this->mUniqPrefix; } @@ -282,12 +284,6 @@ class Parser $text = $this->strip( $text, $x ); wfRunHooks( 'ParserAfterStrip', array( &$this, &$text, &$x ) ); - # Hook to suspend the parser in this state - if ( !wfRunHooks( 'ParserBeforeInternalParse', array( &$this, &$text, &$x ) ) ) { - wfProfileOut( $fname ); - return $text ; - } - $text = $this->internalParse( $text ); $text = $this->unstrip( $text, $this->mStripState ); @@ -358,6 +354,21 @@ class Parser return $this->mOutput; } + /** + * Recursive parser entry point that can be called from an extension tag + * hook. 
+ */ + function recursiveTagParse( $text ) { + wfProfileIn( __METHOD__ ); + $x =& $this->mStripState; + wfRunHooks( 'ParserBeforeStrip', array( &$this, &$text, &$x ) ); + $text = $this->strip( $text, $x ); + wfRunHooks( 'ParserAfterStrip', array( &$this, &$text, &$x ) ); + $text = $this->internalParse( $text ); + wfProfileOut( __METHOD__ ); + return $text; + } + /** * Get a random string * @@ -395,8 +406,7 @@ class Parser * @static */ function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){ - $rand = Parser::getRandomString(); - $n = 1; + static $n = 1; $stripped = ''; $matches = array(); @@ -423,7 +433,7 @@ class Parser $inside = $p[4]; } - $marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++) . '-QINU'; + $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU'; $stripped .= $marker; if ( $close === '/>' ) { @@ -474,6 +484,7 @@ class Parser * @private */ function strip( $text, &$state, $stripcomments = false , $dontstrip = array () ) { + wfProfileIn( __METHOD__ ); $render = ($this->mOutputType == OT_HTML); # Replace any instances of the placeholders @@ -505,6 +516,7 @@ class Parser list( $element, $content, $params, $tag ) = $data; if( $render ) { $tagName = strtolower( $element ); + wfProfileIn( __METHOD__."-render-$tagName" ); switch( $tagName ) { case '!--': // Comment @@ -539,14 +551,22 @@ class Parser throw new MWException( "Invalid call hook $element" ); } } + wfProfileOut( __METHOD__."-render-$tagName" ); } else { // Just stripping tags; keep the source $output = $tag; } + + // Unstrip the output, because unstrip() is no longer recursive so + // it won't do it itself + $output = $this->unstrip( $output, $state ); + if( !$stripcomments && $element == '!--' ) { $commentState[$marker] = $output; + } elseif ( $element == 'html' || $element == 'nowiki' ) { + $state['nowiki'][$marker] = $output; } else { - $state[$element][$marker] = $output; + $state['general'][$marker] = $output; } } @@ -559,6 +579,7 @@ class Parser 
$text = strtr( $text, $commentState ); } + wfProfileOut( __METHOD__ ); return $text; } @@ -569,20 +590,14 @@ class Parser * @private */ function unstrip( $text, &$state ) { - if ( !is_array( $state ) ) { + if ( !isset( $state['general'] ) ) { return $text; } - $replacements = array(); - foreach( $state as $tag => $contentDict ) { - if( $tag != 'nowiki' && $tag != 'html' ) { - foreach( $contentDict as $uniq => $content ) { - $replacements[$uniq] = $content; - } - } - } - $text = strtr( $text, $replacements ); - + wfProfileIn( __METHOD__ ); + # TODO: good candidate for FSS + $text = strtr( $text, $state['general'] ); + wfProfileOut( __METHOD__ ); return $text; } @@ -592,20 +607,15 @@ class Parser * @private */ function unstripNoWiki( $text, &$state ) { - if ( !is_array( $state ) ) { + if ( !isset( $state['nowiki'] ) ) { return $text; } - $replacements = array(); - foreach( $state as $tag => $contentDict ) { - if( $tag == 'nowiki' || $tag == 'html' ) { - foreach( $contentDict as $uniq => $content ) { - $replacements[$uniq] = $content; - } - } - } - $text = strtr( $text, $replacements ); - + wfProfileIn( __METHOD__ ); + # TODO: good candidate for FSS + $text = strtr( $text, $state['nowiki'] ); + wfProfileOut( __METHOD__ ); + return $text; } @@ -621,7 +631,7 @@ class Parser if ( !$state ) { $state = array(); } - $state['item'][$rnd] = $text; + $state['general'][$rnd] = $text; return $rnd; } @@ -881,6 +891,12 @@ class Parser $fname = 'Parser::internalParse'; wfProfileIn( $fname ); + # Hook to suspend the parser in this state + if ( !wfRunHooks( 'ParserBeforeInternalParse', array( &$this, &$text, &$x ) ) ) { + wfProfileOut( $fname ); + return $text ; + } + # Remove tags and sections $text = strtr( $text, array( '' => '' , '' => '' ) ); $text = strtr( $text, array( '' => '', '' => '') ); @@ -927,9 +943,52 @@ class Parser * @private */ function &doMagicLinks( &$text ) { - $text = $this->magicISBN( $text ); - $text = $this->magicRFC( $text, 'RFC ', 'rfcurl' ); - $text = 
$this->magicRFC( $text, 'PMID ', 'pubmedurl' ); + wfProfileIn( __METHOD__ ); + $text = preg_replace_callback( + '!(?: # Start cases + # Skip link text + <.*?> | # Skip stuff inside HTML elements + (?:RFC|PMID)\s+([0-9]+) | # RFC or PMID, capture number as m[1] + ISBN\s+([0-9Xx-]+) # ISBN, capture number as m[2] + )!x', array( &$this, 'magicLinkCallback' ), $text ); + wfProfileOut( __METHOD__ ); + return $text; + } + + function magicLinkCallback( $m ) { + if ( substr( $m[0], 0, 1 ) == '<' ) { + # Skip HTML element + return $m[0]; + } elseif ( substr( $m[0], 0, 4 ) == 'ISBN' ) { + $isbn = $m[2]; + $num = strtr( $isbn, array( + '-' => '', + ' ' => '', + 'x' => 'X', + )); + $titleObj = Title::makeTitle( NS_SPECIAL, 'Booksources' ); + $text = 'ISBN $isbn"; + } else { + if ( substr( $m[0], 0, 3 ) == 'RFC' ) { + $keyword = 'RFC'; + $urlmsg = 'rfcurl'; + $id = $m[1]; + } elseif ( substr( $m[0], 0, 4 ) == 'PMID' ) { + $keyword = 'PMID'; + $urlmsg = 'pubmedurl'; + $id = $m[1]; + } else { + throw new MWException( __METHOD__.': unrecognised match type "' . + substr($m[0], 0, 20 ) . 
'"' ); + } + + $url = wfMsg( $urlmsg, $id); + $sk =& $this->mOptions->getSkin(); + $la = $sk->getExternalLinkAttributes( $url, $keyword.$id ); + $text = "{$keyword} {$id}"; + } return $text; } @@ -1391,7 +1450,7 @@ class Parser $useLinkPrefixExtension = $wgContLang->linkPrefixExtension(); if( is_null( $this->mTitle ) ) { - throw new MWException( 'nooo' ); + throw new MWException( __METHOD__.": \$this->mTitle is null\n" ); } $nottalk = !$this->mTitle->isTalkPage(); @@ -1406,10 +1465,9 @@ class Parser } $selflink = $this->mTitle->getPrefixedText(); - wfProfileOut( $fname.'-setup' ); - $checkVariantLink = sizeof($wgContLang->getVariants())>1; $useSubpages = $this->areSubpagesAllowed(); + wfProfileOut( $fname.'-setup' ); # Loop for each link for ($k = 0; isset( $a[$k] ); $k++) { @@ -1432,6 +1490,7 @@ class Parser $might_be_img = false; + wfProfileIn( "$fname-e1" ); if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt $text = $m[2]; # If we get a ] at the beginning of $m[3] that means we have a link that's something like: @@ -1443,27 +1502,33 @@ class Parser # and no image is in sight. See bug 2095. # if( $text !== '' && - preg_match( "/^\](.*)/s", $m[3], $n ) && + substr( $m[3], 0, 1 ) === ']' && strpos($text, '[') !== false ) { $text .= ']'; # so that replaceExternalLinks($text) works later - $m[3] = $n[1]; + $m[3] = substr( $m[3], 1 ); } # fix up urlencoded title texts - if(preg_match('/%/', $m[1] )) + if( strpos( $m[1], '%' ) !== false ) { # Should anchors '#' also be rejected? $m[1] = str_replace( array('<', '>'), array('<', '>'), urldecode($m[1]) ); + } $trail = $m[3]; } elseif( preg_match($e1_img, $line, $m) ) { # Invalid, but might be an image with a link in its caption $might_be_img = true; $text = $m[2]; - if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]); + if ( strpos( $m[1], '%' ) !== false ) { + $m[1] = urldecode($m[1]); + } $trail = ""; } else { # Invalid form; output directly $s .= $prefix . '[[' . 
$line ; + wfProfileOut( "$fname-e1" ); continue; } + wfProfileOut( "$fname-e1" ); + wfProfileIn( "$fname-misc" ); # Don't allow internal links to pages containing # PROTO: where PROTO is a valid URL protocol; these @@ -1486,9 +1551,12 @@ class Parser $link = substr($link, 1); } + wfProfileOut( "$fname-misc" ); + wfProfileIn( "$fname-title" ); $nt = Title::newFromText( $this->unstripNoWiki($link, $this->mStripState) ); if( !$nt ) { $s .= $prefix . '[[' . $line; + wfProfileOut( "$fname-title" ); continue; } @@ -1501,23 +1569,26 @@ class Parser $ns = $nt->getNamespace(); $iw = $nt->getInterWiki(); - + wfProfileOut( "$fname-title" ); + if ($might_be_img) { # if this is actually an invalid link + wfProfileIn( "$fname-might_be_img" ); if ($ns == NS_IMAGE && $noforce) { #but might be an image $found = false; while (isset ($a[$k+1]) ) { #look at the next 'line' to see if we can close it there $spliced = array_splice( $a, $k + 1, 1 ); $next_line = array_shift( $spliced ); - if( preg_match("/^(.*?]].*?)]](.*)$/sD", $next_line, $m) ) { - # the first ]] closes the inner link, the second the image + $m = explode( ']]', $next_line, 3 ); + if ( count( $m ) == 3 ) { + # the first ]] closes the inner link, the second the image $found = true; - $text .= '[[' . $m[1]; + $text .= "[[{$m[0]}]]{$m[1]}"; $trail = $m[2]; break; - } elseif( preg_match("/^.*?]].*$/sD", $next_line, $m) ) { + } elseif ( count( $m ) == 2 ) { #if there's exactly one ]] that's fine, we'll keep looking - $text .= '[[' . $m[0]; + $text .= "[[{$m[0]}]]{$m[1]}"; } else { #if $next_line is invalid too, we need look no further $text .= '[[' . $next_line; @@ -1528,31 +1599,36 @@ class Parser # we couldn't find the end of this imageLink, so output it raw #but don't ignore what might be perfectly normal links in the text we've examined $text = $this->replaceInternalLinks($text); - $s .= $prefix . '[[' . $link . '|' . 
$text; + $s .= "{$prefix}[[$link|$text"; # note: no $trail, because without an end, there *is* no trail + wfProfileOut( "$fname-might_be_img" ); continue; } } else { #it's not an image, so output it raw - $s .= $prefix . '[[' . $link . '|' . $text; + $s .= "{$prefix}[[$link|$text"; # note: no $trail, because without an end, there *is* no trail + wfProfileOut( "$fname-might_be_img" ); continue; } + wfProfileOut( "$fname-might_be_img" ); } $wasblank = ( '' == $text ); if( $wasblank ) $text = $link; - # Link not escaped by : , create the various objects if( $noforce ) { # Interwikis + wfProfileIn( "$fname-interwiki" ); if( $iw && $this->mOptions->getInterwikiMagic() && $nottalk && $wgContLang->getLanguageName( $iw ) ) { $this->mOutput->addLanguageLink( $nt->getFullText() ); $s = rtrim($s . "\n"); $s .= trim($prefix . $trail, "\n") == '' ? '': $prefix . $trail; + wfProfileOut( "$fname-interwiki" ); continue; } + wfProfileOut( "$fname-interwiki" ); if ( $ns == NS_IMAGE ) { wfProfileIn( "$fname-image" ); @@ -1642,11 +1718,12 @@ class Parser /** * Make a link placeholder. The text returned can be later resolved to a real link with * replaceLinkHolders(). This is done for two reasons: firstly to avoid further - * parsing of interwiki links, and secondly to allow all extistence checks and + * parsing of interwiki links, and secondly to allow all existence checks and * article length checks (for stub links) to be bundled into a single query. * */ function makeLinkHolder( &$nt, $text = '', $query = '', $trail = '', $prefix = '' ) { + wfProfileIn( __METHOD__ ); if ( ! 
is_object($nt) ) { # Fail gracefully $retVal = "{$prefix}{$text}{$trail}"; @@ -1668,6 +1745,7 @@ class Parser $retVal = '{$trail}"; } } + wfProfileOut( __METHOD__ ); return $retVal; } @@ -2355,172 +2433,164 @@ class Parser * '{' => array( # opening parentheses * 'end' => '}', # closing parentheses * 'cb' => array(2 => callback, # replacement callback to call if {{..}} is found - * 4 => callback # replacement callback to call if {{{{..}}}} is found + * 3 => callback # replacement callback to call if {{{..}}} is found * ) * ) + * 'min' => 2, # Minimum parenthesis count in cb + * 'max' => 3, # Maximum parenthesis count in cb * @private */ function replace_callback ($text, $callbacks) { - wfProfileIn( __METHOD__ . '-self' ); + wfProfileIn( __METHOD__ ); $openingBraceStack = array(); # this array will hold a stack of parentheses which are not closed yet $lastOpeningBrace = -1; # last not closed parentheses - for ($i = 0; $i < strlen($text); $i++) { - # check for any opening brace - $rule = null; - $nextPos = -1; - foreach ($callbacks as $key => $value) { - $pos = strpos ($text, $key, $i); - if (false !== $pos && (-1 == $nextPos || $pos < $nextPos)) { - $rule = $value; - $nextPos = $pos; - } + $validOpeningBraces = implode( '', array_keys( $callbacks ) ); + + $i = 0; + while ( $i < strlen( $text ) ) { + # Find next opening brace, closing brace or pipe + if ( $lastOpeningBrace == -1 ) { + $currentClosing = ''; + $search = $validOpeningBraces; + } else { + $currentClosing = $openingBraceStack[$lastOpeningBrace]['braceEnd']; + $search = $validOpeningBraces . '|' . 
$currentClosing; } - - if ($lastOpeningBrace >= 0) { - $pos = strpos ($text, $openingBraceStack[$lastOpeningBrace]['braceEnd'], $i); - - if (false !== $pos && (-1 == $nextPos || $pos < $nextPos)){ - $rule = null; - $nextPos = $pos; - } - - $pos = strpos ($text, '|', $i); - - if (false !== $pos && (-1 == $nextPos || $pos < $nextPos)){ - $rule = null; - $nextPos = $pos; + $rule = null; + $i += strcspn( $text, $search, $i ); + if ( $i < strlen( $text ) ) { + if ( $text[$i] == '|' ) { + $found = 'pipe'; + } elseif ( $text[$i] == $currentClosing ) { + $found = 'close'; + } else { + $found = 'open'; + $rule = $callbacks[$text[$i]]; } - } - - if ($nextPos == -1) + } else { + # All done break; + } - $i = $nextPos; - - # found openning brace, lets add it to parentheses stack - if (null != $rule) { + if ( $found == 'open' ) { + # found opening brace, let's add it to parentheses stack $piece = array('brace' => $text[$i], 'braceEnd' => $rule['end'], - 'count' => 1, 'title' => '', 'parts' => null); - # count openning brace characters - while ($i+1 < strlen($text) && $text[$i+1] == $piece['brace']) { - $piece['count']++; - $i++; - } - - $piece['startAt'] = $i+1; - $piece['partStart'] = $i+1; + # count opening brace characters + $piece['count'] = strspn( $text, $piece['brace'], $i ); + $piece['startAt'] = $piece['partStart'] = $i + $piece['count']; + $i += $piece['count']; - # we need to add to stack only if openning brace count is enough for any given rule - foreach ($rule['cb'] as $cnt => $fn) { - if ($piece['count'] >= $cnt) { - $lastOpeningBrace ++; - $openingBraceStack[$lastOpeningBrace] = $piece; - break; + # we need to add to stack only if opening brace count is enough for one of the rules + if ( $piece['count'] >= $rule['min'] ) { + $lastOpeningBrace ++; + $openingBraceStack[$lastOpeningBrace] = $piece; + } + } elseif ( $found == 'close' ) { + # lets check if it is enough characters for closing brace + $maxCount = $openingBraceStack[$lastOpeningBrace]['count']; + $count = 
strspn( $text, $text[$i], $i, $maxCount ); + + # check for maximum matching characters (if there are 5 closing + # characters, we will probably need only 3 - depending on the rules) + $matchingCount = 0; + $matchingCallback = null; + $cbType = $callbacks[$openingBraceStack[$lastOpeningBrace]['brace']]; + if ( $count > $cbType['max'] ) { + # The specified maximum exists in the callback array, unless the caller + # has made an error + $matchingCount = $cbType['max']; + } else { + # Count is less than the maximum + # Skip any gaps in the callback array to find the true largest match + # Need to use array_key_exists not isset because the callback can be null + $matchingCount = $count; + while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $cbType['cb'] ) ) { + --$matchingCount; } } - continue; - } - else if ($lastOpeningBrace >= 0) { - # first check if it is a closing brace - if ($openingBraceStack[$lastOpeningBrace]['braceEnd'] == $text[$i]) { - # lets check if it is enough characters for closing brace - $count = 1; - while ($i+$count < strlen($text) && $text[$i+$count] == $text[$i]) - $count++; - - # if there are more closing parentheses than opening ones, we parse less - if ($openingBraceStack[$lastOpeningBrace]['count'] < $count) - $count = $openingBraceStack[$lastOpeningBrace]['count']; - - # check for maximum matching characters (if there are 5 closing characters, we will probably need only 3 - depending on the rules) - $matchingCount = 0; - $matchingCallback = null; - foreach ($callbacks[$openingBraceStack[$lastOpeningBrace]['brace']]['cb'] as $cnt => $fn) { - if ($count >= $cnt && $matchingCount < $cnt) { - $matchingCount = $cnt; - $matchingCallback = $fn; - } - } + if ($matchingCount <= 0) { + $i += $count; + continue; + } + $matchingCallback = $cbType['cb'][$matchingCount]; - if ($matchingCount == 0) { - $i += $count - 1; - continue; - } + # let's set a title or last part (if '|' was found) + if (null === 
$openingBraceStack[$lastOpeningBrace]['parts']) { + $openingBraceStack[$lastOpeningBrace]['title'] = + substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], + $i - $openingBraceStack[$lastOpeningBrace]['partStart']); + } else { + $openingBraceStack[$lastOpeningBrace]['parts'][] = + substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], + $i - $openingBraceStack[$lastOpeningBrace]['partStart']); + } - # lets set a title or last part (if '|' was found) - if (null === $openingBraceStack[$lastOpeningBrace]['parts']) - $openingBraceStack[$lastOpeningBrace]['title'] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']); - else - $openingBraceStack[$lastOpeningBrace]['parts'][] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']); - - $pieceStart = $openingBraceStack[$lastOpeningBrace]['startAt'] - $matchingCount; - $pieceEnd = $i + $matchingCount; - - if( is_callable( $matchingCallback ) ) { - $cbArgs = array ( - 'text' => substr($text, $pieceStart, $pieceEnd - $pieceStart), - 'title' => trim($openingBraceStack[$lastOpeningBrace]['title']), - 'parts' => $openingBraceStack[$lastOpeningBrace]['parts'], - 'lineStart' => (($pieceStart > 0) && ($text[$pieceStart-1] == "\n")), - ); - # finally we can call a user callback and replace piece of text - wfProfileOut( __METHOD__ . '-self' ); - $replaceWith = call_user_func( $matchingCallback, $cbArgs ); - wfProfileIn( __METHOD__ . '-self' ); - $text = substr($text, 0, $pieceStart) . $replaceWith . 
substr($text, $pieceEnd); - $i = $pieceStart + strlen($replaceWith) - 1; - } - else { - # null value for callback means that parentheses should be parsed, but not replaced - $i += $matchingCount - 1; - } + $pieceStart = $openingBraceStack[$lastOpeningBrace]['startAt'] - $matchingCount; + $pieceEnd = $i + $matchingCount; + + if( is_callable( $matchingCallback ) ) { + $cbArgs = array ( + 'text' => substr($text, $pieceStart, $pieceEnd - $pieceStart), + 'title' => trim($openingBraceStack[$lastOpeningBrace]['title']), + 'parts' => $openingBraceStack[$lastOpeningBrace]['parts'], + 'lineStart' => (($pieceStart > 0) && ($text[$pieceStart-1] == "\n")), + ); + # finally we can call a user callback and replace piece of text + $replaceWith = call_user_func( $matchingCallback, $cbArgs ); + $text = substr($text, 0, $pieceStart) . $replaceWith . substr($text, $pieceEnd); + $i = $pieceStart + strlen($replaceWith); + } else { + # null value for callback means that parentheses should be parsed, but not replaced + $i += $matchingCount; + } - # reset last openning parentheses, but keep it in case there are unused characters - $piece = array('brace' => $openingBraceStack[$lastOpeningBrace]['brace'], - 'braceEnd' => $openingBraceStack[$lastOpeningBrace]['braceEnd'], - 'count' => $openingBraceStack[$lastOpeningBrace]['count'], - 'title' => '', - 'parts' => null, - 'startAt' => $openingBraceStack[$lastOpeningBrace]['startAt']); - $openingBraceStack[$lastOpeningBrace--] = null; - - if ($matchingCount < $piece['count']) { - $piece['count'] -= $matchingCount; - $piece['startAt'] -= $matchingCount; - $piece['partStart'] = $piece['startAt']; - # do we still qualify for any callback with remaining count? 
- foreach ($callbacks[$piece['brace']]['cb'] as $cnt => $fn) { - if ($piece['count'] >= $cnt) { - $lastOpeningBrace ++; - $openingBraceStack[$lastOpeningBrace] = $piece; - break; - } + # reset last opening parentheses, but keep it in case there are unused characters + $piece = array('brace' => $openingBraceStack[$lastOpeningBrace]['brace'], + 'braceEnd' => $openingBraceStack[$lastOpeningBrace]['braceEnd'], + 'count' => $openingBraceStack[$lastOpeningBrace]['count'], + 'title' => '', + 'parts' => null, + 'startAt' => $openingBraceStack[$lastOpeningBrace]['startAt']); + $openingBraceStack[$lastOpeningBrace--] = null; + + if ($matchingCount < $piece['count']) { + $piece['count'] -= $matchingCount; + $piece['startAt'] -= $matchingCount; + $piece['partStart'] = $piece['startAt']; + # do we still qualify for any callback with remaining count? + $currentCbList = $callbacks[$piece['brace']]['cb']; + while ( $piece['count'] ) { + if ( array_key_exists( $piece['count'], $currentCbList ) ) { + $lastOpeningBrace++; + $openingBraceStack[$lastOpeningBrace] = $piece; + break; } + --$piece['count']; } - continue; } - + } elseif ( $found == 'pipe' ) { # lets set a title if it is a first separator, or next part otherwise - if ($text[$i] == '|') { - if (null === $openingBraceStack[$lastOpeningBrace]['parts']) { - $openingBraceStack[$lastOpeningBrace]['title'] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']); - $openingBraceStack[$lastOpeningBrace]['parts'] = array(); - } - else - $openingBraceStack[$lastOpeningBrace]['parts'][] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']); - - $openingBraceStack[$lastOpeningBrace]['partStart'] = $i + 1; + if (null === $openingBraceStack[$lastOpeningBrace]['parts']) { + $openingBraceStack[$lastOpeningBrace]['title'] = + substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], + $i - 
$openingBraceStack[$lastOpeningBrace]['partStart']); + $openingBraceStack[$lastOpeningBrace]['parts'] = array(); + } else { + $openingBraceStack[$lastOpeningBrace]['parts'][] = + substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], + $i - $openingBraceStack[$lastOpeningBrace]['partStart']); } + $openingBraceStack[$lastOpeningBrace]['partStart'] = ++$i; } } - wfProfileOut( __METHOD__ . '-self' ); + wfProfileOut( __METHOD__ ); return $text; } @@ -2545,7 +2615,7 @@ class Parser return $text; } - $fname = 'Parser::replaceVariables'; + $fname = __METHOD__ /*. '-L' . count( $this->mArgStack )*/; wfProfileIn( $fname ); # This function is called recursively. To keep track of arguments we need a stack: @@ -2558,13 +2628,25 @@ class Parser if ( $this->mOutputType == OT_HTML || $this->mOutputType == OT_WIKI ) { $braceCallbacks[3] = array( &$this, 'argSubstitution' ); } - $callbacks = array(); - $callbacks['{'] = array('end' => '}', 'cb' => $braceCallbacks); - $callbacks['['] = array('end' => ']', 'cb' => array(2=>null)); - $text = $this->replace_callback ($text, $callbacks); - - array_pop( $this->mArgStack ); + if ( $braceCallbacks ) { + $callbacks = array( + '{' => array( + 'end' => '}', + 'cb' => $braceCallbacks, + 'min' => $argsOnly ? 3 : 2, + 'max' => isset( $braceCallbacks[3] ) ? 3 : 2, + ), + '[' => array( + 'end' => ']', + 'cb' => array(2=>null), + 'min' => 2, + 'max' => 2, + ) + ); + $text = $this->replace_callback ($text, $callbacks); + array_pop( $this->mArgStack ); + } wfProfileOut( $fname ); return $text; } @@ -2636,8 +2718,9 @@ class Parser */ function braceSubstitution( $piece ) { global $wgContLang, $wgLang, $wgAllowDisplayTitle, $action; - $fname = 'Parser::braceSubstitution'; + $fname = __METHOD__ /*. '-L' . count( $this->mArgStack )*/; wfProfileIn( $fname ); + wfProfileIn( __METHOD__.'-setup' ); # Flags $found = false; # $text has been filled @@ -2671,8 +2754,10 @@ class Parser $args = (null == $piece['parts']) ? 
array() : $piece['parts']; $argc = count( $args ); + wfProfileOut( __METHOD__.'-setup' ); # SUBST + wfProfileIn( __METHOD__.'-modifiers' ); if ( !$found ) { $mwSubst =& MagicWord::get( 'subst' ); if ( $mwSubst->matchStartAndRemove( $part1 ) xor ($this->mOutputType == OT_WIKI) ) { @@ -2714,6 +2799,7 @@ class Parser } } } + wfProfileOut( __METHOD__.'-modifiers' ); # Parser functions if ( !$found ) { @@ -2776,7 +2862,7 @@ class Parser $text = $linestart . '{{' . $part1 . '}}' . ''; - wfDebug( "$fname: template loop broken at '$part1'\n" ); + wfDebug( __METHOD__.": template loop broken at '$part1'\n" ); } else { # set $text to cached message. $text = $linestart . $this->mTemplates[$piece['title']]; @@ -3374,137 +3460,6 @@ class Parser } } - /** - * Return an HTML link for the "ISBN 123456" text - * @private - */ - function magicISBN( $text ) { - $fname = 'Parser::magicISBN'; - wfProfileIn( $fname ); - - $a = split( 'ISBN ', ' '.$text ); - if ( count ( $a ) < 2 ) { - wfProfileOut( $fname ); - return $text; - } - $text = substr( array_shift( $a ), 1); - $valid = '0123456789-Xx'; - - foreach ( $a as $x ) { - # hack: don't replace inside thumbnail title/alt - # attributes - if(preg_match('/<[^>]+(alt|title)="[^">]*$/', $text)) { - $text .= "ISBN $x"; - continue; - } - - $isbn = $blank = '' ; - while ( $x !== '' && ' ' == $x{0} ) { - $blank .= ' '; - $x = substr( $x, 1 ); - } - if ( $x == '' ) { # blank isbn - $text .= "ISBN $blank"; - continue; - } - while ( strstr( $valid, $x{0} ) != false ) { - $isbn .= $x{0}; - $x = substr( $x, 1 ); - } - $num = str_replace( '-', '', $isbn ); - $num = str_replace( ' ', '', $num ); - $num = str_replace( 'x', 'X', $num ); - - if ( '' == $num ) { - $text .= "ISBN $blank$x"; - } else { - $titleObj = Title::makeTitle( NS_SPECIAL, 'Booksources' ); - $text .= 'ISBN $isbn"; - $text .= $x; - } - } - wfProfileOut( $fname ); - return $text; - } - - /** - * Return an HTML link for the "RFC 1234" text - * - * @private - * @param string $text Text to 
be processed - * @param string $keyword Magic keyword to use (default RFC) - * @param string $urlmsg Interface message to use (default rfcurl) - * @return string - */ - function magicRFC( $text, $keyword='RFC ', $urlmsg='rfcurl' ) { - - $valid = '0123456789'; - $internal = false; - - $a = split( $keyword, ' '.$text ); - if ( count ( $a ) < 2 ) { - return $text; - } - $text = substr( array_shift( $a ), 1); - - /* Check if keyword is preceed by [[. - * This test is made here cause of the array_shift above - * that prevent the test to be done in the foreach. - */ - if ( substr( $text, -2 ) == '[[' ) { - $internal = true; - } - - foreach ( $a as $x ) { - /* token might be empty if we have RFC RFC 1234 */ - if ( $x=='' ) { - $text.=$keyword; - continue; - } - - # hack: don't replace inside thumbnail title/alt - # attributes - if(preg_match('/<[^>]+(alt|title)="[^">]*$/', $text)) { - $text .= $keyword . $x; - continue; - } - - $id = $blank = '' ; - - /** remove and save whitespaces in $blank */ - while ( $x{0} == ' ' ) { - $blank .= ' '; - $x = substr( $x, 1 ); - } - - /** remove and save the rfc number in $id */ - while ( strstr( $valid, $x{0} ) != false ) { - $id .= $x{0}; - $x = substr( $x, 1 ); - } - - if ( $id == '' ) { - /* call back stripped spaces*/ - $text .= $keyword.$blank.$x; - } elseif( $internal ) { - /* normal link */ - $text .= $keyword.$id.$x; - } else { - /* build the external link*/ - $url = wfMsg( $urlmsg, $id); - $sk =& $this->mOptions->getSkin(); - $la = $sk->getExternalLinkAttributes( $url, $keyword.$id ); - $text .= "{$keyword}{$id}{$x}"; - } - - /* Check if the next RFC keyword is preceed by [[ */ - $internal = ( substr($x,-2) == '[[' ); - } - return $text; - } - /** * Transform wiki markup when saving a page by doing \r\n -> \n * conversion, substitting signatures, {{subst:}} templates, etc. 
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index a4ab66229a..185679f6cd 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -329,75 +329,88 @@ class Sanitizer { */ static function removeHTMLtags( $text, $processCallback = null, $args = array() ) { global $wgUseTidy, $wgUserHtml; - $fname = 'Parser::removeHTMLtags'; - wfProfileIn( $fname ); - - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed - 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', - 'strike', 'strong', 'tt', 'var', 'div', 'center', - 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' - ); - $htmlsingle = array( - 'br', 'hr', 'li', 'dt', 'dd' - ); - $htmlsingleonly = array( # Elements that cannot have close tags - 'br', 'hr' - ); - $htmlnest = array( # Tags that can be nested--?? - 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', - 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' - ); - $tabletags = array( # Can only appear inside table - 'td', 'th', 'tr', - ); - $htmllist = array( # Tags used by list - 'ul','ol', - ); - $listtags = array( # Tags that can appear in a list - 'li', - ); - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } + static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, + $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised; + + wfProfileIn( __METHOD__ ); + + if ( !$staticInitialised ) { + if( $wgUserHtml ) { + $htmlpairs = array( # Tags that must be closed + 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', + 'strike', 'strong', 'tt', 'var', 'div', 'center', + 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' + ); + $htmlsingle = array( + 'br', 
'hr', 'li', 'dt', 'dd' + ); + $htmlsingleonly = array( # Elements that cannot have close tags + 'br', 'hr' + ); + $htmlnest = array( # Tags that can be nested--?? + 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', + 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' + ); + $tabletags = array( # Can only appear inside table + 'td', 'th', 'tr', + ); + $htmllist = array( # Tags used by list + 'ul','ol', + ); + $listtags = array( # Tags that can appear in a list + 'li', + ); + + } else { + $htmlpairs = array(); + $htmlsingle = array(); + $htmlnest = array(); + $tabletags = array(); + } + + $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); + $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); - $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); - $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); + # Convert them all to hashtables for faster lookup + $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', + 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' ); + foreach ( $vars as $var ) { + $$var = array_flip( $$var ); + } + $staticInitialised = true; + } # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = array_shift( $bits ); if(!$wgUseTidy) { - $tagstack = array(); $tablestack = array(); + $tagstack = $tablestack = array(); foreach ( $bits as $x ) { $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); - preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', - $x, $regs ); + preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ); list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; error_reporting( $prev ); $badtag = 0 ; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack if ( $slash ) { # Closing a tag... 
- if( in_array( $t, $htmlsingleonly ) ) { + if( isset( $htmlsingleonly[$t] ) ) { $badtag = 1; } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) { - if ( in_array($ot, $htmlsingleallowed) ) { + if ( isset( $htmlsingleallowed[$ot] ) ) { # Pop all elements with an optional close tag # and see if we find a match below them $optstack = array(); array_push ($optstack, $ot); while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) && - in_array($ot, $htmlsingleallowed) ) { + isset( $htmlsingleallowed[$ot] ) ) + { array_push ($optstack, $ot); } if ( $t != $ot ) { @@ -410,7 +423,7 @@ class Sanitizer { } else { @array_push( $tagstack, $ot ); #
<li> can be nested in <ul> or <ol>
        , skip those cases: - if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) { + if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) { $badtag = 1; } } @@ -422,20 +435,20 @@ class Sanitizer { $newparams = ''; } else { # Keep track for later - if ( in_array( $t, $tabletags ) && + if ( isset( $tabletags[$t] ) && ! in_array( 'table', $tagstack ) ) { $badtag = 1; } else if ( in_array( $t, $tagstack ) && - ! in_array ( $t , $htmlnest ) ) { + ! isset( $htmlnest [$t ] ) ) { $badtag = 1 ; # Is it a self closed htmlpair ? (bug 5487) } else if( $brace == '/>' && - in_array($t, $htmlpairs) ) { + isset( $htmlpairs[$t] ) ) { $badtag = 1; - } elseif( in_array( $t, $htmlsingleonly ) ) { + } elseif( isset( $htmlsingleonly[$t] ) ) { # Hack to force empty tag for uncloseable elements $brace = '/>'; - } else if( in_array( $t, $htmlsingle ) ) { + } else if( isset( $htmlsingle[$t] ) ) { # Hack to not close $htmlsingle tags $brace = NULL; } else { @@ -475,7 +488,7 @@ class Sanitizer { preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ); @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { if( is_callable( $processCallback ) ) { call_user_func_array( $processCallback, array( &$params, $args ) ); } @@ -487,7 +500,7 @@ class Sanitizer { } } } - wfProfileOut( $fname ); + wfProfileOut( __METHOD__ ); return $text; } @@ -502,8 +515,7 @@ class Sanitizer { * @return string */ static function removeHTMLcomments( $text ) { - $fname='Parser::removeHTMLcomments'; - wfProfileIn( $fname ); + wfProfileIn( __METHOD__ ); while (($start = strpos($text, '', $start + 4); if ($end === false) { @@ -533,7 +545,7 @@ class Sanitizer { $text = substr_replace($text, '', $start, $end - $start); } } - wfProfileOut( $fname ); + wfProfileOut( __METHOD__ ); return $text; } diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt index 
cce844ecec..e4f1971f69 100644 --- a/maintenance/parserTests.txt +++ b/maintenance/parserTests.txt @@ -2808,7 +2808,7 @@ Thumbnail image caption with a free URL !! end !! test -TODO: BUG 1887: A ISBN with a thumbnail +BUG 1887: A ISBN with a thumbnail !! input [[Image:foobar.jpg|thumb|ISBN 12354]] !! result -- 2.20.1